library(data.table)
library(ggplot2)
library(dplyr)
library(ggrepel)
This R Markdown creates a Manhattan plot where each gene is represented by its lead cis-eQTL (minimum normial p values). It also plots a TSS distance histogram.
ciseQTLs = fread(params$ciseQTLs_path)
location = fread(params$snpsloc_path)
# merge
data = merge(location, ciseQTLs, by = "snp", all = FALSE)
# Standardise columns
if (!("chr" %in% names(data))) {
# try alternatives
if ("CHR" %in% names(data)) data$chr <- data$CHR
}
data$name = paste0(data$snp, "_", data$gene_name)
keep_cols <- intersect(c("chr","pos","snp","gene_name","name","p"), names(data))
data = data[, ..keep_cols]
setnames(data, names(data), tolower(names(data)))
setnames(data, c("chr","pos","snp","gene_name","name","p"), c("chr","bp","risd","gene_name","snp_id","p"), skip_absent=TRUE)
# coerce types
data$chr <- as.numeric(gsub("chr", "", as.character(data$chr)))
data$bp <- as.numeric(as.character(data$bp))
data <- data[!is.na(data$chr) & !is.na(data$bp)]
if (!file.exists(params$geneloc_path)) stop("geneloc file not found: ", params$geneloc_path)
geneloc <- fread(params$geneloc_path)
# pick gene_name column robustly
if ("V7" %in% names(geneloc) & "V8" %in% names(geneloc)) {
geneloc$gene_name = ifelse(geneloc$V7 %in% c("KNOWN","NOVEL"), geneloc$V8, geneloc$V7)
} else if ("gene_name" %in% names(geneloc)) {
geneloc$gene_name = geneloc$gene_name
} else {
stop("Unexpected geneloc format — cannot determine gene name column")
}
geneloc = geneloc[!duplicated(geneloc$gene_name),]
yy = merge(data, geneloc, by = "gene_name", all.x = TRUE)
if (nrow(yy) == 0) stop("Merge with gene locations produced 0 rows — check gene names")
# compute TSS depending on strand column V5 or strand if present
strand_col <- if ("V5" %in% names(yy)) "V5" else if ("strand" %in% names(yy)) "strand" else NA
start_col <- if ("V2" %in% names(yy)) "V2" else if ("start" %in% names(yy)) "start" else NA
end_col <- if ("V3" %in% names(yy)) "V3" else if ("end" %in% names(yy)) "end" else NA
if (is.na(strand_col) | is.na(start_col) | is.na(end_col)) {
warning("Gene location file missing expected columns for TSS calculation. TSS will be NA where not computable.")
yy$gene_tss <- NA
} else {
yy$gene_tss = ifelse(yy[[strand_col]] == "+", yy[[start_col]], yy[[end_col]])
}
yy$distance_kb = (yy$gene_tss - yy$bp) / 1000
yy$distance_bp = yy$gene_tss - yy$bp
save_table = yy[, .(gene_name, chr, bp, risd, snp_id, gene_tss, distance_bp)]
setnames(save_table, c("gene_name","chr","bp","risd","snp_id","gene_tss","distance_bp"),
c("gene_name","CHR","BP(hg19)","SNP","Pair","gene_tss","distance(bp)"))
#write.table(save_table, "PC47_LMM.resul_TSS.distance.txt", row.names = FALSE, quote = FALSE, sep = "\t")
cat("Wrote PC47_LMM.resul_TSS.distance.txt with", nrow(save_table), "rows\n")
## Wrote PC47_LMM.resul_TSS.distance.txt with 1160107 rows
dtss = yy$distance_kb
dtss = dtss[!is.na(dtss)]
if (length(dtss) == 0) {
plot.new(); text(0.5,0.5,"No TSS distances available")
} else {
h <- hist(dtss, breaks=100, col="#c77cff",
main="Distance distribution of cis-eQTLs",
xlab="Distance between eQTL and TSS (KB)")
xfit <- seq(min(dtss), max(dtss), length=40)
yfit <- dnorm(xfit, mean=mean(dtss), sd=sd(dtss))
yfit <- yfit * diff(h$mids[1:2]) * length(dtss)
lines(xfit, yfit, col="grey", lwd=2)
}
sig_cut <- 0.001
sig.dat <- data %>% filter(p < sig_cut)
notsig.dat <- data %>% filter(p >= sig_cut)
prop <- params$subsample_non_sig_prop
if (nrow(notsig.dat) > 0) {
notsig.sample <- notsig.dat %>% slice_sample(prop = prop)
data2 <- bind_rows(sig.dat, notsig.sample)
} else {
data2 <- data
}
data2$BP <- as.numeric(data2$bp)
data2$CHR <- as.integer(data2$chr)
don <- data2 %>%
group_by(CHR) %>%
summarise(chr_len = max(BP, na.rm = TRUE)) %>%
mutate(tot = cumsum(chr_len) - chr_len) %>%
select(-chr_len) %>%
left_join(data2, ., by = c("CHR" = "CHR")) %>%
arrange(CHR, BP) %>%
mutate(BPcum = BP + tot)
axisdf = don %>% group_by(CHR) %>% summarize(center = (max(BPcum, na.rm=TRUE) + min(BPcum, na.rm=TRUE)) / 2)
if (file.exists(params$TST_genes_path)) {
TST <- fread(params$TST_genes_path)
if ("MaxFC" %in% names(TST)) TST <- TST[TST$MaxFC >= 0.58, ]
if ("HGNC" %in% names(TST)) TST$gene_name = TST$HGNC
} else {
TST <- data.table(gene_name = character(0))
}
# pick lead SNP per gene (minimum p-value)
don$p <- as.numeric(as.character(don$p))
cat(print(min(don$p, na.rm=TRUE)))
## [1] 4.265115e-153
## 4.265115e-153
don2 <- setDT(don)[, .SD[which.min(p)], by = gene_name]
cat("Lead SNPs selected:", nrow(don2), "unique genes\n")
## Lead SNPs selected: 17033 unique genes
cat("Minimum p-value in plot data:", min(don2$p, na.rm = TRUE), "\n")
## Minimum p-value in plot data: 4.265115e-153
don2$p <- as.numeric(as.character(don2$p))
don2$log10p <- -log10(don2$p)
g <- ggplot(don2, aes(x = BPcum, y = log10p)) +
geom_point(aes(color = as.factor(CHR)), alpha = 0.6, size = 0.6) +
scale_color_manual(values = rep(c("grey", "black"), 22)) +
scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
ylab(expression("-Log"["10"]~italic("P"))) + xlab("Genomic position (Chr)") +
theme_bw() +
theme(legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 30, hjust = 1),
plot.title = element_text(face = "bold")) +
ggtitle("HIRV_LMM lead cis-eQTLs (lead SNP per gene)")+
geom_point(data = don2[don2$gene_name %in% TST$gene_name,], col = "red", alpha = 1, size = 0.6) +
geom_text_repel(data = don2[don2$gene_name %in% TST$gene_name & don2$p < 1e-50,],
aes(label = gene_name), size = 3.5, min.segment.length = 0, max.overlaps = Inf)
print(g)